In [37]:
# Python libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
%matplotlib inline
import itertools
import lightgbm as lgbm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, confusion_matrix,  roc_curve, precision_recall_curve, accuracy_score, roc_auc_score
from datetime import datetime
import lightgbm as lgbm
import warnings
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
import warnings

from contextlib import contextmanager

@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

warnings.filterwarnings('ignore') #ignore warning messages
/Users/karandesai/anaconda3/envs/fancy/lib/python3.7/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.1) compiler.
This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
  "You can install the OpenMP library by the following command: ``brew install libomp``.", UserWarning)

Start with a Clustering Profile of the customers

In [2]:
#model csv
df = pd.read_csv('modelready.csv')
In [3]:
#preprocessing and getting ready to model
#train_test_split first - Churn is the target 

X = df[['gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 
       'totalcharges_tenure_ratio', 'monthlyperc_of_total']]

y = df['Churn']
In [4]:
#first - onehotencode the string categorical values
X = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,stratify=y,random_state=30)
In [5]:
print(f'Train size:{X_train.shape}')
print(f'Test size:{X_test.shape}')
Train size:(5634, 31)
Test size:(1409, 31)
In [6]:
#I will start by creating a KMeans model to see if there is any value in clustering the customers

#K Means clustering - Choosing the correct value for cluster size
kinert = []
for k in range(1,15):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    kinert.append(kmeans.inertia_)

# the best value is elbow value. It's 5.
plt.figure(figsize=(15,5))
plt.plot(range(1,15),kinert)
plt.xlabel("number of k (cluster) value")
plt.ylabel("Inertia")
plt.show()

From the Inertia of the models with k neighbours 1-15, we can reasonably assert that there are at least 2 clusters, however, it would be worth looking at 3 clusters because there still seems to be value there.

In [7]:
# Choose k = 2 based on previous visual as well as looking at the inertia and how it seems to reach an elbow around 3
#or 3 k's (will make 2 models ) 

#instantiate model
kmeans2 = KMeans(n_clusters=2)
kmeans3 = KMeans(n_clusters=3)
#fit model
kmeans2.fit(X)
kmeans3.fit(X)

#predict clusters
pred_2 = kmeans2.predict(X)
pred_3 = kmeans3.predict(X)
In [8]:
plt.figure(figsize=(16,16))
#tenure v monthlycharges - points coloured by cluster predictions of model

plt.subplot(4,2,1)
sns.scatterplot(x=X['tenure'],y=X['MonthlyCharges'],hue=pred_2,alpha=0.5)
plt.title('tenure v Monthly Charges colored by 2 clusters (predictions)')
#tenure v monthlycharges - points coloured by churn

plt.subplot(4,2,2)
sns.scatterplot(x=X['tenure'],y=X['MonthlyCharges'],hue=df['Churn'],alpha=0.5)
plt.title('tenure v Monthly Charges colored by actual Churn')

#tenure v totalcharges - cluster 

plt.subplot(4,2,3)
sns.scatterplot(x=X['tenure'],y=X['TotalCharges'],hue=pred_2,alpha=0.5)
plt.title('tenure v Total Charges colored by 2 clusters (predictions)')
#tenure v totalcharges - churn

plt.subplot(4,2,4)
sns.scatterplot(x=X['tenure'],y=X['TotalCharges'],hue=df['Churn'],alpha=0.5)
plt.title('tenure v Total Charges colored by actual Churn')

#tenure v monthly - cluster
plt.subplot(4,2,5)
sns.scatterplot(x=X['tenure'],y=X['MonthlyCharges'],hue=pred_3,alpha=0.5)
plt.title('tenure v Monthly Charges colored by 3 clusters (predictions)')
#tenure v monthly - churn
plt.subplot(4,2,6)
sns.scatterplot(x=X['tenure'],y=X['MonthlyCharges'],hue=df['Churn'],alpha=0.5)
plt.title('tenure v Monthly Charges colored by actual Churn')



plt.subplot(4,2,7)
sns.scatterplot(x=X['tenure'],y=X['TotalCharges'],hue=pred_3,alpha=0.5)
plt.title('tenure v Total Charges colored by 3 clusters (predictions)')
plt.subplot(4,2,8)
sns.scatterplot(x=X['tenure'],y=X['TotalCharges'],hue=df['Churn'],alpha=0.5)
plt.title('tenure v Total Charges colored by actual Churn')



plt.savefig('saved_figs/clusters.png')
plt.tight_layout()
plt.show()

At first glance, it does not seem like the clusters do anything at all. However, I will provide more context into the findings by looking at the data itself to see how it has clustered the customers.

Another note that needs to be mentioned, this is an entirely unsupervised model, there is no scoring to be done because we have no actual clusters to predict against.

Recluster using a scaler/reducer to scale the data and lower dimensionality

In [9]:
#Create a pipeline for modeling
#scaler, reducer and 2 models (2,3 clusters)
scaler = StandardScaler()
reducer = PCA()
model = KMeans(n_clusters = 2)
model2 = KMeans(n_clusters=3)


pipe = make_pipeline(scaler,reducer,model)
pipe2 = make_pipeline(scaler,reducer,model2)



#fit the pipeline 
pipe.fit(X_train)
pipe2.fit(X_train)
/Users/karandesai/anaconda3/envs/fancy/lib/python3.7/site-packages/sklearn/preprocessing/data.py:625: DataConversionWarning: Data with input dtype uint8, int64, float64 were all converted to float64 by StandardScaler.
  return self.partial_fit(X, y)
/Users/karandesai/anaconda3/envs/fancy/lib/python3.7/site-packages/sklearn/base.py:462: DataConversionWarning: Data with input dtype uint8, int64, float64 were all converted to float64 by StandardScaler.
  return self.fit(X, **fit_params).transform(X)
/Users/karandesai/anaconda3/envs/fancy/lib/python3.7/site-packages/sklearn/preprocessing/data.py:625: DataConversionWarning: Data with input dtype uint8, int64, float64 were all converted to float64 by StandardScaler.
  return self.partial_fit(X, y)
/Users/karandesai/anaconda3/envs/fancy/lib/python3.7/site-packages/sklearn/base.py:462: DataConversionWarning: Data with input dtype uint8, int64, float64 were all converted to float64 by StandardScaler.
  return self.fit(X, **fit_params).transform(X)
Out[9]:
Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kmeans', KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0))])
In [10]:
pipd2 = pipe.predict(X_train)
pipd3 = pipe2.predict(X_train)
/Users/karandesai/anaconda3/envs/fancy/lib/python3.7/site-packages/sklearn/pipeline.py:331: DataConversionWarning: Data with input dtype uint8, int64, float64 were all converted to float64 by StandardScaler.
  Xt = transform.transform(Xt)
/Users/karandesai/anaconda3/envs/fancy/lib/python3.7/site-packages/sklearn/pipeline.py:331: DataConversionWarning: Data with input dtype uint8, int64, float64 were all converted to float64 by StandardScaler.
  Xt = transform.transform(Xt)
In [11]:
#replot previous subplots - ONLY 2 CLUSTERS 

plt.figure(figsize=(12,12))
#tenure v monthlycharges - points coloured by cluster predictions of model

plt.subplot(2,2,1)
sns.scatterplot(x=X_train['tenure'],y=X_train['MonthlyCharges'],hue=pipd2,alpha=0.5)
plt.title('tenure v Monthly Charges colored by 2 clusters (predictions)')
#tenure v monthlycharges - points coloured by churn

plt.subplot(2,2,2)
sns.scatterplot(x=X_train['tenure'],y=X_train['MonthlyCharges'],hue=y_train,alpha=0.5)
plt.title('tenure v Monthly Charges colored by actual Churn')

#tenure v totalcharges - cluster 

plt.subplot(2,2,3)
sns.scatterplot(x=X_train['tenure'],y=X_train['TotalCharges'],hue=pipd2,alpha=0.5)
plt.title('tenure v Total Charges colored by 2 clusters (predictions)')
#tenure v totalcharges - churn

plt.subplot(2,2,4)
sns.scatterplot(x=X_train['tenure'],y=X_train['TotalCharges'],hue=y_train,alpha=0.5)
plt.title('tenure v Total Charges colored by actual Churn')



plt.savefig('saved_figs/clusters-2.png')
plt.tight_layout()
plt.show()
In [12]:
#replot previous subplots - ONLY 3 CLUSTERS 

plt.figure(figsize=(12,12))
#tenure v monthlycharges - points coloured by cluster predictions of model

plt.subplot(2,2,1)
sns.scatterplot(x=X_train['tenure'],y=X_train['MonthlyCharges'],hue=pipd3,alpha=0.5)
plt.title('tenure v Monthly Charges colored by 3 clusters (predictions)')
#tenure v monthlycharges - points coloured by churn

plt.subplot(2,2,2)
sns.scatterplot(x=X_train['tenure'],y=X_train['MonthlyCharges'],hue=y_train,alpha=0.5)
plt.title('tenure v Monthly Charges colored by actual Churn')

#tenure v totalcharges - cluster 

plt.subplot(2,2,3)
sns.scatterplot(x=X_train['tenure'],y=X_train['TotalCharges'],hue=pipd3,alpha=0.5)
plt.title('tenure v Total Charges colored by 3 clusters (predictions)')
#tenure v totalcharges - churn

plt.subplot(2,2,4)
sns.scatterplot(x=X_train['tenure'],y=X_train['TotalCharges'],hue=y_train,alpha=0.5)
plt.title('tenure v Total Charges colored by actual Churn')



plt.savefig('saved_figs/clusters-3.png')
plt.tight_layout()
plt.show()

Scaling the model and applying a dimensionality reducer actually garners a much better result with 3 clusters. We can actually see a pattern, with 3 clusters it seems the model is much better at providing context.

From these plots we can see that there is one set of customers who seem to have low charges, no matter how long their tenure at the bottom of the graphs. The groups that interest us are those in the other 2 clusters where it seems there is 2 types. One that is low tenure and high charge (could they be plan hoppers?). The second are long tenure as well as high charge. Without more context on the customers here, I can't give more of an explanation. However, I will visualize a few other charts to see if there is more to the story.

In [14]:
# See how the 3 groups are different with the data 
km_3 = X_train.copy()
km_3['Groups'] = pipd3
km_3['Churn'] = y_train
In [15]:
#a glance at the dataset grouped by cluster 
#What pops out right away is the 
GROUPS3 = km_3.groupby('Groups').mean().transpose()
In [21]:
#looking at these first few columns - we see that there is a significant difference in the 3 groups
GROUPS3.columns = ['Group1','Group2','Group3']
GROUPS3.head()  
Out[21]:
Group1 Group2 Group3
SeniorCitizen 0.212662 0.033415 0.181678
Partner 0.313718 0.481663 0.702007
Dependents 0.177760 0.419723 0.370561
tenure 15.675731 30.693562 55.380854
PhoneService 0.851055 1.000000 0.905816
In [17]:
g0 = km_3.loc[km_3['Groups']==0]
g1 = km_3.loc[km_3['Groups']==1]
g2 = km_3.loc[km_3['Groups']==2]
In [18]:
for col in km_3.columns[:-2]:
    plt.figure()
    sns.distplot(g0[col],norm_hist=True,bins=10,label='Group1')
    sns.distplot(g1[col],norm_hist=True,bins=10,label='Group2')
    sns.distplot(g2[col],norm_hist=True,bins=10,label='Group3')
    plt.title(f'Distribution of {col} for each group')
    plt.legend()
    plt.show
/Users/karandesai/anaconda3/envs/fancy/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
/Users/karandesai/anaconda3/envs/fancy/lib/python3.7/site-packages/statsmodels/nonparametric/kde.py:488: RuntimeWarning: invalid value encountered in true_divide
  binned = fast_linbin(X, a, b, gridsize) / (delta * nobs)
/Users/karandesai/anaconda3/envs/fancy/lib/python3.7/site-packages/statsmodels/nonparametric/kdetools.py:34: RuntimeWarning: invalid value encountered in double_scalars
  FAC1 = 2*(np.pi*bw/RANGE)**2
/Users/karandesai/anaconda3/envs/fancy/lib/python3.7/site-packages/numpy/core/fromnumeric.py:83: RuntimeWarning: invalid value encountered in reduce
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
/Users/karandesai/anaconda3/envs/fancy/lib/python3.7/site-packages/matplotlib/pyplot.py:514: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
In [22]:
GROUPS3
Out[22]:
Group1 Group2 Group3
SeniorCitizen 0.212662 0.033415 0.181678
Partner 0.313718 0.481663 0.702007
Dependents 0.177760 0.419723 0.370561
tenure 15.675731 30.693562 55.380854
PhoneService 0.851055 1.000000 0.905816
OnlineSecurity 0.209821 0.999185 0.559959
OnlineBackup 0.256899 1.000000 0.667010
DeviceProtection 0.228084 1.000000 0.702522
TechSupport 0.194805 1.000000 0.592383
StreamingTV 0.312094 1.000000 0.710242
StreamingMovies 0.324675 0.999185 0.719506
PaperlessBilling 0.679383 0.288509 0.658260
MonthlyCharges 68.242898 21.102486 87.586541
TotalCharges 1072.867309 664.342258 4871.401235
totalcharges_tenure_ratio 68.214772 20.978163 87.620299
monthlyperc_of_total 25.411197 17.798907 2.067070
gender_Female 0.492695 0.479218 0.494596
gender_Male 0.507305 0.520782 0.505404
MultipleLines_No 0.525162 0.779951 0.244467
MultipleLines_No phone service 0.148945 0.000000 0.094184
MultipleLines_Yes 0.325893 0.220049 0.661348
InternetService_DSL 0.447646 0.000815 0.427689
InternetService_Fiber optic 0.552354 0.000000 0.572311
InternetService_No 0.000000 0.999185 0.000000
Contract_Month-to-month 0.922484 0.348818 0.202265
Contract_One year 0.065747 0.241239 0.375193
Contract_Two year 0.011769 0.409943 0.422542
PaymentMethod_Bank transfer (automatic) 0.133523 0.214344 0.332990
PaymentMethod_Credit card (automatic) 0.130276 0.217604 0.331961
PaymentMethod_Electronic check 0.527192 0.079870 0.250643
PaymentMethod_Mailed check 0.209010 0.488183 0.084406
Churn 0.456981 0.074165 0.143078
Characteristics of average customer in Group1:
  1. Older age group
  2. Highest Monthly Charge % of Total
  3. 92% of customers in this group are Month-Month
  4. Shortest tenured ~ 15months
  5. 52% pay by electronic check (much higher compared to other 2)
  6. Likely unmarried or childless, (17% have dependents)
  7. 45% of these customers Churn
Characteristics of average customer in Group2:
  1. Youngest age group
  2. Do not have multiple lines
  3. Paperless billing
  4. Do not have multiple lines
  5. Likely young family (42% have dependents)
  6. Lowest churn, 7%
Characteristics of average customer in Group3:
  1. Longest tenured 85months(average)
  2. Middle Aged, 70% have partners
  3. 37% have dependents (so most are likely older)
  4. Multiple lines (likely have homes)
  5. Least likely to be month-month (42% have 2 year plans)
  6. Pay the lowest monthly % of total cost (Highest Total Cost)

Feature Engineering and selection for actual model

In [45]:
data=df.copy()

#create a feature called engaged, Month-Month users would qualify as disengaged
data.loc[:,'Engaged']=1 
data.loc[(data['Contract']=='Month-to-month'),'Engaged']=0

#Young and Not Engaged
data.loc[:,'YandNotE']=0
data.loc[(data['SeniorCitizen']==0) & (data['Engaged']==0),'YandNotE']=1

#Electronic payment
data.loc[:,'ElectCheck']=0 
data.loc[(data['PaymentMethod']=='Electronic check') & (data['Engaged']==0),'ElectCheck']=1

#fiberoptic
data.loc[:,'fiberopt']=1 
data.loc[(data['InternetService']!='Fiber optic'),'fiberopt']=0

#Users that use TV but not internet
data.loc[:,'StreamNoInt']=1 
data.loc[(data['StreamingTV']!='No internet service'),'StreamNoInt']=0

#no online protection or tech support
data.loc[:,'NoProt']=1 
data.loc[(data['OnlineBackup']!='No') | (data['DeviceProtection']!='No') | (data['TechSupport']!='No'),'NoProt']=0

#User of all services
data['TotalServices'] = (data[['PhoneService', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']]== 'Yes').sum(axis=1)
In [47]:
data.head() #can see the 7 new features created that will help eliminate waste
Out[47]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn totalcharges_tenure_ratio monthlyperc_of_total Engaged YandNotE ElectCheck fiberopt StreamNoInt NoProt TotalServices
0 7590-VHVEG Female 0 1 0 1 0 No phone service DSL 0 1 0 0 0 0 Month-to-month 1 Electronic check 29.85 29.85 0 29.850000 100.000000 0 1 1 0 0 0 0
1 5575-GNVDE Male 0 0 0 34 1 No DSL 1 0 1 0 0 0 One year 0 Mailed check 56.95 1889.50 0 55.573529 3.014025 1 0 0 0 0 0 0
2 3668-QPYBK Male 0 0 0 2 1 No DSL 1 1 0 0 0 0 Month-to-month 1 Mailed check 53.85 108.15 1 54.075000 49.791956 0 1 0 0 0 0 0
3 7795-CFOCW Male 0 0 0 45 0 No phone service DSL 1 0 1 1 0 0 One year 0 Bank transfer (automatic) 42.30 1840.75 0 40.905556 2.297976 1 0 0 0 0 0 0
4 9237-HQITU Female 0 0 0 2 1 No Fiber optic 0 0 0 0 0 0 Month-to-month 1 Electronic check 70.70 151.65 1 75.825000 46.620508 0 1 1 1 0 0 0
In [48]:
#binning tenure
data['tenure'] = pd.cut(data['tenure'], 3)
In [50]:
#drop columns
data = data.drop(columns = [
                            'Contract',
                            'DeviceProtection', 
                            'Partner'
                           ])
In [51]:
#encode data and scale
#customer id col
Id_col     = ['customerID']
#Target columns
target_col = ["Churn"]
#categorical columns
cat_cols   = data.nunique()[data.nunique() < 10].keys().tolist()
cat_cols   = [x for x in cat_cols if x not in target_col]
#numerical columns
num_cols   = [x for x in data.columns if x not in cat_cols + target_col + Id_col]
#Binary columns with 2 values
bin_cols   = data.nunique()[data.nunique() == 2].keys().tolist()
#Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]

#Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols :
    data[i] = le.fit_transform(data[i])
    
#Duplicating columns for multi value columns
data = pd.get_dummies(data = data,columns = multi_cols )

#Scaling Numerical columns
std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
scaled = pd.DataFrame(scaled,columns=num_cols)

#dropping original values merging scaled values for numerical columns
df_data_og = data.copy()
data = data.drop(columns = num_cols,axis = 1)
data = data.merge(scaled,left_index=True,right_index=True,how = "left")
data = data.drop(['customerID'],axis = 1)
In [53]:
data.head() #left with a sparse matrix with 4 numeric columns, 2 of which were engineered during EDA
Out[53]:
gender SeniorCitizen Dependents PhoneService OnlineSecurity OnlineBackup TechSupport StreamingTV StreamingMovies PaperlessBilling Churn Engaged YandNotE ElectCheck fiberopt ... MultipleLines_Yes InternetService_DSL InternetService_Fiber optic InternetService_No PaymentMethod_Bank transfer (automatic) PaymentMethod_Credit card (automatic) PaymentMethod_Electronic check PaymentMethod_Mailed check StreamNoInt_0 NoProt_0 TotalServices_0 MonthlyCharges TotalCharges totalcharges_tenure_ratio monthlyperc_of_total
0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 ... 0 1 0 0 0 0 1 0 1 1 1 -1.160323 -0.992667 -1.151302 2.969664
1 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 ... 0 1 0 0 0 0 0 1 1 1 1 -0.259629 -0.172198 -0.301458 -0.454642
2 1 0 0 1 1 1 0 0 0 1 1 0 1 0 0 ... 0 1 0 0 0 0 0 1 1 1 1 -0.362660 -0.958122 -0.350966 1.196957
3 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 ... 0 1 0 0 1 0 0 0 1 1 1 -0.746535 -0.193706 -0.786053 -0.479923
4 0 0 0 1 0 0 0 0 0 1 1 0 1 1 1 ... 0 0 1 0 0 0 1 0 1 1 1 0.197365 -0.938930 0.367602 1.084982

5 rows × 35 columns

In [54]:
#create a correlation plot using pyplot for interactivity
def correlation_plot():
    #correlation
    correlation = data.corr()
    #tick labels
    matrix_cols = correlation.columns.tolist()
    #convert to array
    corr_array  = np.array(correlation)
    trace = go.Heatmap(z = corr_array,
                       x = matrix_cols,
                       y = matrix_cols,
                       colorscale='Viridis',
                       colorbar   = dict() ,
                      )
    layout = go.Layout(dict(title = 'Correlation Matrix for variables',
                            #autosize = False,
                            #height  = 1400,
                            #width   = 1600,
                            margin  = dict(r = 0 ,l = 210,
                                           t = 25,b = 210,
                                         ),
                            yaxis   = dict(tickfont = dict(size = 9)),
                            xaxis   = dict(tickfont = dict(size = 9)),
                           )
                      )
    fig = go.Figure(data = [trace],layout = layout)
    py.iplot(fig)
In [55]:
correlation_plot()
In [56]:
#remove features with co-linearity to prevent overfitting
#Threshold for removing correlated variables
threshold = 0.9

# Absolute value correlation matrix
corr_matrix = data.corr().abs()
corr_matrix.head()

# Upper triangle of correlations
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper.head()

# Select columns with correlations above threshold
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

print('There are %d columns to remove :' % (len(to_drop)))

data = data.drop(columns = to_drop)

to_drop
There are 3 columns to remove :
Out[56]:
['MultipleLines_No phone service',
 'InternetService_Fiber optic',
 'totalcharges_tenure_ratio']
In [57]:
correlation_plot()
In [58]:
# Def X and Y
y = np.array(data.Churn.tolist())
data = data.drop('Churn', 1)
X = np.array(data.as_matrix())
In [61]:
# Train_test split
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)

At this point we are ready to model - Previously during clustering there was alot less preprocessing required

In [62]:
#Def model_performance to create clean looking report after training,testing
def model_performance(model) : 
    #Conf matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    trace1 = go.Heatmap(z = conf_matrix  ,x = ["0 (pred)","1 (pred)"],
                        y = ["0 (true)","1 (true)"],xgap = 2, ygap = 2, 
                        colorscale = 'Viridis', showscale  = False)

    #Show metrics
    tp = conf_matrix[1,1]
    fn = conf_matrix[1,0]
    fp = conf_matrix[0,1]
    tn = conf_matrix[0,0]
    Accuracy  =  ((tp+tn)/(tp+tn+fp+fn))
    Precision =  (tp/(tp+fp))
    Recall    =  (tp/(tp+fn))
    F1_score  =  (2*(((tp/(tp+fp))*(tp/(tp+fn)))/((tp/(tp+fp))+(tp/(tp+fn)))))

    show_metrics = pd.DataFrame(data=[[Accuracy , Precision, Recall, F1_score]])
    show_metrics = show_metrics.T

    colors = ['gold', 'lightgreen', 'lightcoral', 'lightskyblue']
    trace2 = go.Bar(x = (show_metrics[0].values), 
                    y = ['Accuracy', 'Precision', 'Recall', 'F1_score'], text = np.round_(show_metrics[0].values,4),
                    textposition = 'auto', textfont=dict(color='black'),
                    orientation = 'h', opacity = 1, marker=dict(
            color=colors,
            line=dict(color='#000000',width=1.5)))
    
    #Roc curve
    model_roc_auc = round(roc_auc_score(y_test, y_score) , 3)
    fpr, tpr, t = roc_curve(y_test, y_score)
    trace3 = go.Scatter(x = fpr,y = tpr,
                        name = "Roc : " + str(model_roc_auc),
                        line = dict(color = ('rgb(22, 96, 167)'),width = 2), fill='tozeroy')
    trace4 = go.Scatter(x = [0,1],y = [0,1],
                        line = dict(color = ('black'),width = 1.5,
                        dash = 'dot'))
    
    # Precision-recall curve
    precision, recall, thresholds = precision_recall_curve(y_test, y_score)
    trace5 = go.Scatter(x = recall, y = precision,
                        name = "Precision" + str(precision),
                        line = dict(color = ('lightcoral'),width = 2), fill='tozeroy')
    
    #Feature importance
    coefficients  = pd.DataFrame(eval(model).feature_importances_)
    column_data   = pd.DataFrame(list(data))
    coef_sumry    = (pd.merge(coefficients,column_data,left_index= True,
                              right_index= True, how = "left"))
    coef_sumry.columns = ["coefficients","features"]
    coef_sumry    = coef_sumry.sort_values(by = "coefficients",ascending = False)
    coef_sumry = coef_sumry[coef_sumry["coefficients"] !=0]
    trace6 = go.Bar(x = coef_sumry["features"],y = coef_sumry["coefficients"],
                    name = "coefficients", 
                    marker = dict(color = coef_sumry["coefficients"],
                                  colorscale = "Viridis",
                                  line = dict(width = .6,color = "black")))
    
    #Cumulative gain
    pos = pd.get_dummies(y_test).as_matrix()
    pos = pos[:,1] 
    npos = np.sum(pos)
    index = np.argsort(y_score) 
    index = index[::-1] 
    sort_pos = pos[index]
    #cumulative sum
    cpos = np.cumsum(sort_pos) 
    #recall
    recall = cpos/npos 
    #size obs test
    n = y_test.shape[0] 
    size = np.arange(start=1,stop=369,step=1) 
    #proportion
    size = size / n 
    #plots
    model = model
    trace7 = go.Scatter(x = size,y = recall,
                        line = dict(color = ('gold'),width = 2), fill='tozeroy') 
    
    #Subplots
    fig = tls.make_subplots(rows=4, cols=2, print_grid=False,
                          specs=[[{}, {}], 
                                 [{}, {}],
                                 [{'colspan': 2}, None],
                                 [{'colspan': 2}, None]],
                          subplot_titles=('Confusion Matrix',
                                          'Metrics',
                                          'ROC curve'+" "+ '('+ str(model_roc_auc)+')',
                                          'Precision - Recall curve',
                                          'Cumulative gains curve',
                                          'Feature importance'
                                          ))
    
    
    fig.append_trace(trace1,1,1)
    fig.append_trace(trace2,1,2)
    fig.append_trace(trace3,2,1)
    fig.append_trace(trace4,2,1)
    fig.append_trace(trace5,2,2)
    fig.append_trace(trace6,4,1)
    fig.append_trace(trace7,3,1)
    
    fig['layout'].update(showlegend = False, title = '<b>Model performance report</b><br>'+str(model),
                        autosize = False, height = 1500,width = 830,
                        plot_bgcolor = 'black',
                        paper_bgcolor = 'black',
                        margin = dict(b = 195), font=dict(color='white'))
    fig["layout"]["xaxis1"].update(color = 'white')
    fig["layout"]["yaxis1"].update(color = 'white')
    fig["layout"]["xaxis2"].update((dict(range=[0, 1], color = 'white')))
    fig["layout"]["yaxis2"].update(color = 'white')
    fig["layout"]["xaxis3"].update(dict(title = "false positive rate"), color = 'white')
    fig["layout"]["yaxis3"].update(dict(title = "true positive rate"),color = 'white')
    fig["layout"]["xaxis4"].update(dict(title = "recall"), range = [0,1.05],color = 'white')
    fig["layout"]["yaxis4"].update(dict(title = "precision"), range = [0,1.05],color = 'white')
    fig["layout"]["xaxis5"].update(dict(title = "Percentage contacted"),color = 'white')
    fig["layout"]["yaxis5"].update(dict(title = "Percentage positive targeted"),color = 'white')
    fig["layout"]["xaxis6"].update(color = 'white')
    fig["layout"]["yaxis6"].update(color = 'white')
    for i in fig['layout']['annotations']:
        i['font'] = titlefont=dict(color='white', size = 14)
    py.iplot(fig)
    
    
# Cross val metric
def cross_val_metrics(model) :
    scores = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    for sc in scores:
        scores = cross_val_score(model, X, y, cv = 5, scoring = sc)
        print('[%s] : %0.5f (+/- %0.5f)'%(sc, scores.mean(), scores.std()))
In [63]:
#I will create a LGBM classifier to predict churn
lgbm_clf = lgbm.LGBMClassifier(n_estimators=1500, random_state = 42)

lgbm_clf.fit(X_train, y_train)
lgbm_clf.fit(X_train, y_train)
y_pred = lgbm_clf.predict(X_test)
y_score = lgbm_clf.predict_proba(X_test)[:,1]

model_performance('lgbm_clf')

The above model scored accuracy of 77% - we should aim for above 80% with a supervised learner.

To maximize the score, adjusting hyperparameters of the model is key. Below I have created a RandomSearchCV that will optimize parameters and score them, keeping track of the scores and will stop when it reaches the best score possible.

In [65]:
fit_params = {"early_stopping_rounds" : 50, 
             "eval_metric" : 'binary', 
             "eval_set" : [(X_test,y_test)],
             'eval_names': ['valid'],
             'verbose': 0,
             'categorical_feature': 'auto'}

param_test = {'learning_rate' : [0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.1, 0.2, 0.3, 0.4],
              'n_estimators' : [100, 200, 300, 400, 500, 600, 800, 1000, 1500, 2000, 3000, 5000],
              'num_leaves': sp_randint(6, 50), 
              'min_child_samples': sp_randint(100, 500), 
              'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
              'subsample': sp_uniform(loc=0.2, scale=0.8), 
              'max_depth': [-1, 1, 2, 3, 4, 5, 6, 7],
              'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

#number of combinations
n_iter = 200

#intialize lgbm and lunch the search
lgbm_clf = lgbm.LGBMClassifier(random_state=random_state, silent=True, metric='None', n_jobs=4)
grid_search = RandomizedSearchCV(
    estimator=lgbm_clf, param_distributions=param_test, 
    n_iter=n_iter,
    scoring='accuracy',
    cv=5,
    refit=True,
    random_state=random_state,
    verbose=10,n_jobs=2)

grid_search.fit(X_train, y_train, **fit_params)
print('Best params: {} '.format(grid_search.best_params_))

opt_parameters =  grid_search.best_params_
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    1.6s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    2.1s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    2.8s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    4.2s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:    6.9s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    9.8s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   10.6s
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   11.6s
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:   13.6s
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:   16.2s
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed:   18.0s
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:   20.1s
[Parallel(n_jobs=2)]: Done 109 tasks      | elapsed:   23.3s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:   26.5s
[Parallel(n_jobs=2)]: Done 141 tasks      | elapsed:   30.6s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:   40.8s
[Parallel(n_jobs=2)]: Done 177 tasks      | elapsed:   50.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   58.0s
[Parallel(n_jobs=2)]: Done 217 tasks      | elapsed:  1.1min
[Parallel(n_jobs=2)]: Done 238 tasks      | elapsed:  1.2min
[Parallel(n_jobs=2)]: Done 261 tasks      | elapsed:  1.3min
[Parallel(n_jobs=2)]: Batch computation too fast (0.1996s.) Setting batch_size=2.
[Parallel(n_jobs=2)]: Batch computation too slow (2.7730s.) Setting batch_size=1.
[Parallel(n_jobs=2)]: Done 288 tasks      | elapsed:  1.4min
[Parallel(n_jobs=2)]: Done 313 tasks      | elapsed:  1.5min
[Parallel(n_jobs=2)]: Batch computation too fast (0.1895s.) Setting batch_size=2.
[Parallel(n_jobs=2)]: Batch computation too slow (8.9596s.) Setting batch_size=1.
[Parallel(n_jobs=2)]: Done 340 tasks      | elapsed:  1.7min
[Parallel(n_jobs=2)]: Done 369 tasks      | elapsed:  1.9min
[Parallel(n_jobs=2)]: Done 396 tasks      | elapsed:  2.0min
[Parallel(n_jobs=2)]: Done 425 tasks      | elapsed:  2.1min
[Parallel(n_jobs=2)]: Done 454 tasks      | elapsed:  2.3min
[Parallel(n_jobs=2)]: Done 485 tasks      | elapsed:  2.5min
[Parallel(n_jobs=2)]: Done 516 tasks      | elapsed:  2.6min
[Parallel(n_jobs=2)]: Done 549 tasks      | elapsed:  2.8min
[Parallel(n_jobs=2)]: Done 582 tasks      | elapsed:  2.9min
[Parallel(n_jobs=2)]: Batch computation too fast (0.1968s.) Setting batch_size=2.
[Parallel(n_jobs=2)]: Batch computation too fast (0.1572s.) Setting batch_size=4.
[Parallel(n_jobs=2)]: Batch computation too slow (2.5550s.) Setting batch_size=2.
[Parallel(n_jobs=2)]: Batch computation too slow (2.6977s.) Setting batch_size=1.
[Parallel(n_jobs=2)]: Done 660 tasks      | elapsed:  3.1min
[Parallel(n_jobs=2)]: Done 696 tasks      | elapsed:  3.3min
[Parallel(n_jobs=2)]: Done 733 tasks      | elapsed:  3.5min
[Parallel(n_jobs=2)]: Done 770 tasks      | elapsed:  3.6min
[Parallel(n_jobs=2)]: Done 809 tasks      | elapsed:  3.9min
[Parallel(n_jobs=2)]: Done 848 tasks      | elapsed:  4.0min
[Parallel(n_jobs=2)]: Done 889 tasks      | elapsed:  4.2min
[Parallel(n_jobs=2)]: Done 930 tasks      | elapsed:  4.4min
[Parallel(n_jobs=2)]: Batch computation too fast (0.1878s.) Setting batch_size=2.
[Parallel(n_jobs=2)]: Done 984 tasks      | elapsed:  4.6min
Best params: {'colsample_bytree': 0.6640914962437607, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_samples': 363, 'min_child_weight': 0.01, 'n_estimators': 100, 'num_leaves': 41, 'reg_alpha': 0.1, 'reg_lambda': 100, 'subsample': 0.804289128254122} 
[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:  4.7min finished
In [66]:
#completed in 4.7 minutes - LGBM is a FAST computation model.

#Rebuild my LGBM with the best parameters from the randomsearch

lgbm_clf = lgbm.LGBMClassifier(**opt_parameters)

lgbm_clf.fit(X_train, y_train)
lgbm_clf.fit(X_train, y_train)
y_pred = lgbm_clf.predict(X_test)
y_score = lgbm_clf.predict_proba(X_test)[:,1]

model_performance('lgbm_clf')

The best_parameters yield a significant increase in precision,recall and f1.

Precision is how precise the model is (how often is the True Value = Predicted) Recall is how often the model predicted the True (1) in the data. This is harder to come by in an imbalanced model like we have here. F1 Score is essentially an average of the 2 and is often used along with precision and recall.

In [67]:
#Finally - Cross validate my model to and take the average of the results as my best model

cross_val_metrics(lgbm_clf)
[accuracy] : 0.80548 (+/- 0.00826)
[precision] : 0.66443 (+/- 0.01847)
[recall] : 0.53932 (+/- 0.01889)
[f1] : 0.59533 (+/- 0.01828)
[roc_auc] : 0.84819 (+/- 0.01003)

Completed